package cn.doitedu.ml.loss

import cn.doitedu.commons.util.SparkUtil
import cn.doitedu.ml.util.VecUtil.arr2Vec
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.classification.LogisticRegression

/**
 * @date: 2020/2/23
 * @site: www.doitedu.cn
 * @author: hunter.d 涛哥
 * @qq: 657270652
 * @description: 流失风险标签计算模型训练 (churn-risk label model training)
 *     算法：逻辑回归分类算法 (algorithm: logistic-regression classifier)
 */
object LossPredict {

  def main(args: Array[String]): Unit = {
    // Silence Spark's verbose INFO logging; keep warnings and errors.
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    val spark = SparkUtil.getSparkSession(this.getClass.getSimpleName)

    // The 12-column feature expression is identical for the training and the
    // test data, so it is built exactly once to prevent the two projections
    // from drifting apart.
    // NOTE(review): the column names start with digits (e.g. 3_cs) — Spark's
    // SQL parser accepts such identifiers unquoted, but confirm they match
    // the CSV headers.
    val featureExpr =
      "arr2Vec(array(3_cs,15_cs,3_xf,15_xf,3_th,15_th,3_hp,15_hp,3_cp,15_cp,last_dl,last_xf)) as features"

    // Load the labelled training sample.
    val sample = spark.read
      .option("header", "true")
      .option("inferSchema", true)
      .csv("userprofile/data/loss_predict/sample/liushi_sample.csv")
    sample.show(100, false)

    // Vectorize the sample: register the array-to-Vector UDF, then project
    // (label, gid, features).
    spark.udf.register("arr2Vec", arr2Vec)
    val features = sample.selectExpr("label", "gid", featureExpr)

    // Configure the logistic-regression estimator.
    val logistic = new LogisticRegression()
      .setFeaturesCol("features")
      .setLabelCol("label")

    // Train the model on the vectorized sample.
    val model = logistic.fit(features)

    // Load the test data and apply the exact same feature vectorization.
    val test = spark.read
      .option("header", "true")
      .option("inferSchema", true)
      .csv("userprofile/data/loss_predict/test")
    test.show(100, false)
    val test_features = test.selectExpr("gid", featureExpr)

    // Score the test data; drop the raw feature vector so the output is readable.
    val predict = model.transform(test_features)
    predict.drop("features").show(100, false)

    spark.close()
  }
}
